%matplotlib inline
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import Image
# Show floats with 2 decimal places in DataFrame reprs.
# NOTE(review): 'precision' is a legacy alias — modern pandas requires
# 'display.precision'; confirm against the pinned pandas version.
pd.set_option('precision', 2)
pd.set_option('notebook_repr_html', True)
# NOTE(review): 'display.mpl_style' was removed from pandas (0.19+); on a
# modern install this line raises OptionError. Styling is effectively
# handled by the seaborn calls below.
pd.set_option('display.mpl_style', 'default')
import seaborn as sns
sns.set_context("talk")
sns.set_style("darkgrid")
print('Done!')
Done!
Despite some 20 years of research efforts that have focused on understanding aspects of the Internet’s infrastructure such as its router-level topology, very little is known about its individual components such as cell towers, routers or switches, and fiber-optic cables which are concrete entities with well-defined geographic locations.
This data set provides a first-of-its-kind map of the US long haul fiber infrastructure. The details of the connectivity and shared use of conduits has been verified using public records on rights-of-way. The dataset is made available through PREDICT and includes the image of the map as well as a table that provides all details on connectivity represented in the map. https://www.predict.org/
# Render the pre-built map image of the US long-haul fiber infrastructure
# that ships with the InterTubes dataset.
Image("./InterTubes-Dataset/long-haul.jpg", width=750)
# Load the conduit-sharing matrix: one row per ISP, one column per conduit,
# each cell counting that ISP's shared use of the conduit.
sharing_url = "https://s3-us-west-2.amazonaws.com/telecoms-analytics-dataset/sharing.txt"
shared_conduit = pd.read_csv(sharing_url, delim_whitespace=True, index_col=0, header=None)
shared_conduit.index.name = 'ISP Names'
# Name the columns S1..Sn ("Number of shared conduits"); a comprehension
# replaces the original manual append loop.
shared_conduit.columns = ['S' + str(k) for k in range(1, shared_conduit.columns.size + 1)]
shared_conduit.head(3)
| S1 | S2 | S3 | S4 | S5 | S6 | S7 | S8 | S9 | S10 | ... | S533 | S534 | S535 | S536 | S537 | S538 | S539 | S540 | S541 | S542 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ISP Names | |||||||||||||||||||||
| ATT | 0 | 0 | 0 | 0 | 0 | 18 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Verizon | 0 | 0 | 0 | 10 | 0 | 18 | 11 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| DeutscheTelekom | 0 | 0 | 0 | 0 | 0 | 18 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 rows × 542 columns
# Mean shared-conduit count per ISP (mean across all 542 conduit columns),
# sorted so the busiest ISPs plot first.
sum_across_conduit = shared_conduit.mean(axis=1)
sum_across_conduit.sort_values(ascending=False, inplace=True)
# BUG FIX: plt.figure() returns a single Figure object (not a (fig, ax)
# pair), so the original `f, ax = plt.figure(...)` raised TypeError.
f = plt.figure(figsize=(15, 10))
# Label corrected to match the statistic actually plotted (mean, not total).
plt.xlabel("MEAN number of conduits shared by each ISP")
sns.barplot(y=sum_across_conduit.index, x=sum_across_conduit.values, palette='BuPu_d')
<matplotlib.axes._subplots.AxesSubplot at 0x113981080>
# Distribution of per-conduit sharing for the ten ISPs with the highest mean
# shared-conduit count: box plots overlaid with the raw data points.
trans_shared_conduit = shared_conduit.T.copy()
top_x = sum_across_conduit.index.values[:10]
plt.figure(figsize=(15, 7))
plt.xlim(-0.5, 20)
for plot_fn in (sns.boxplot, sns.stripplot):
    ax = plot_fn(data=trans_shared_conduit[top_x], orient='h', palette='Set1')
# NOTE(review): the markdown heading ("How many ISPs share each conduit?")
# was fused onto the next code line by the notebook export. The names
# `df_conduit_counts`, `df`, `first`, `secnd` and `third` are defined in
# cells not visible here — confirm against the full notebook before editing.
ISP share each conduit?¶f, axes = plt.subplots(1,2, figsize=(20, 7)); df2 = df_conduit_counts.copy()
# Right panel: box plot of ISPs-per-conduit; left panel: count plot of the same data.
ax = sns.boxplot(x='ISP in Conduits', data=df2, palette='Reds_d', ax=axes[1], whis=19)
ax = sns.countplot(x='ISP in Conduits', data=df2, palette='Reds_d', ax=axes[0])
# Summary statistics; `first`/`secnd`/`third` are presumably the quartiles
# computed in an earlier (omitted) cell — verify.
print("\nMin. : {} \t Max. : {} \t Median : {} \t Q1 : {} \t Q2 : {} \t Q3 : {} "
.format(df.min(), df.max(), np.median(df), first, secnd, third))
Min. : 1 Max. : 19 Median : 4.0 Q1 : 2.0 Q2 : 4.0 Q3 : 8.0
# Location (city;state) nodes.
# Load the node list (expected: 273 locations, "City;State" strings) keyed
# by the numeric location index used in the links file.
node_url = "https://s3-us-west-2.amazonaws.com/telecoms-analytics-dataset/nodes.txt"
node_city_state = pd.read_csv(node_url, sep='\t', header=None,
                              index_col=0, names=['Nodes_City_State'])
node_city_state.index.name = 'Location_Index'
print(node_city_state.shape)
node_city_state.head(3)
(273, 1)
| Nodes_City_State | |
|---|---|
| Location_Index | |
| 1 | Abilene;Texas |
| 2 | Akron;Ohio |
| 3 | Albany;New_York |
# Fiber-optics conduit dataset: one row per conduit (link), endpoints given
# as numeric location indices (Source -> Sink).
edges_url = "https://s3-us-west-2.amazonaws.com/telecoms-analytics-dataset/links.txt"
edges_src_snk = pd.read_csv(edges_url, header=None, names=['Source', 'Sink'])
print("\nNumber of links (edges or conduit) : ", edges_src_snk.index.size)
print(edges_src_snk.shape)
edges_src_snk.head()
Number of links (edges or conduit) : 542 (542, 2)
| Source | Sink | |
|---|---|---|
| 0 | 105 | 21 |
| 1 | 105 | 223 |
| 2 | 105 | 230 |
| 3 | 223 | 21 |
| 4 | 223 | 111 |
# Names for the index in both Source and Sink.
# Attach the human-readable "City;State" name to each conduit's Source index.
on_Source = pd.merge(edges_src_snk, node_city_state,
                     left_on=['Source'], right_index=True)
on_Source.head(2)
| Source | Sink | Nodes_City_State | |
|---|---|---|---|
| 0 | 105 | 21 | Harbour_Pointe;Washington |
| 1 | 105 | 223 | Harbour_Pointe;Washington |
# Resolve both endpoints of every conduit to their location names, then
# relabel the columns: 'Outgoing' = Source location, 'Incoming' = Sink
# location.
on_Source = pd.merge(edges_src_snk, node_city_state, left_on=['Source'], right_index=True)
source_sink = pd.merge(on_Source, node_city_state, left_on=['Sink'], right_index=True)
source_sink.columns = ['Source_Index', 'Sink_Index', 'Outgoing', 'Incoming']
source_sink.sort_index(inplace=True)
# Count how often each location appears as a Sink (Incoming) and as a
# Source (Outgoing); inner-join the two counts on the location name.
all_incoming_outgoing = pd.merge(
    pd.DataFrame(source_sink['Incoming'].value_counts()),
    pd.DataFrame(source_sink['Outgoing'].value_counts()),
    left_index=True, right_index=True)
all_incoming_outgoing.head()
| Incoming | Outgoing | |
|---|---|---|
| Denver;Colorado | 5 | 12 |
| Dallas;Texas | 4 | 11 |
| Salt_Lake_City;Utah | 2 | 11 |
| Atlanta;Georgia | 7 | 8 |
| Chicago;Illinois | 5 | 8 |
# Which locations have the highest INCOMING fiber links?
# Top 5 by incoming count; bars stack incoming and outgoing per location.
highest_incoming_loc = (all_incoming_outgoing
                        .sort_values('Incoming', ascending=False)
                        .head(5))
highest_incoming_loc.plot(
    kind='bar', figsize=(15,6), stacked=True,
    title='Top 5 Locations with HIGHEST INCOMING Fiber-optics Connections')
<matplotlib.axes._subplots.AxesSubplot at 0x11a1e38d0>
# Bottom 5 locations by incoming link count, same stacked-bar presentation.
lowest_incoming_loc = (all_incoming_outgoing
                       .sort_values('Incoming')
                       .head(5))
lowest_incoming_loc.plot(
    kind='bar', figsize=(15,6), stacked=True,
    title='Top 5 Locations with LOWEST INCOMING Fiber-optics Connections')
<matplotlib.axes._subplots.AxesSubplot at 0x109291dd8>
# Which locations have the highest OUTGOING fiber connections?
# Sort on outgoing links and put the Outgoing column first for readability.
highest_outgoing_loc = (all_incoming_outgoing
                        .sort_values(['Outgoing'], ascending=False)
                        .head(5)
                        .reindex(columns=['Outgoing', 'Incoming']))
highest_outgoing_loc.head()
| Outgoing | Incoming | |
|---|---|---|
| Denver;Colorado | 12 | 5 |
| Salt_Lake_City;Utah | 11 | 2 |
| Dallas;Texas | 11 | 4 |
| Atlanta;Georgia | 8 | 7 |
| Chicago;Illinois | 8 | 5 |
# Stacked bar chart of the five locations with the most outgoing links.
highest_outgoing_loc.head(5).plot(
    kind='bar', figsize=(15,6), stacked=True,
    title='Top 5 Locations with highest OUTGOING Connections')
<matplotlib.axes._subplots.AxesSubplot at 0x119db7048>
# BUG FIX: `lowest_outgoing_loc` was never defined anywhere in the notebook
# (only highest_outgoing_loc and lowest_incoming_loc exist), so this cell
# raised NameError. Build it symmetrically to the other extremes: sort
# ascending on the Outgoing count and take the first five.
lowest_outgoing_loc = all_incoming_outgoing.sort_values('Outgoing').head(5)
lowest_outgoing_loc.head(5).plot(kind='bar', figsize=(15,6), stacked=True,
        title='Top 5 Locations with LOWEST OUTGOING Connections')
<matplotlib.axes._subplots.AxesSubplot at 0x119dd8470>
Do states have multiple installations across their cities? Cities and states are not explicitly separated in our data, but we can quickly check whether multiple states appear among the location names.
# Frequency of each Incoming location. The output shows that cities carry
# multiple fiber links and states have installations in multiple cities, so
# City and State must be split into separate columns before per-state work.
source_sink.groupby('Incoming').size().head(7)
Incoming Abilene;Texas 1 Akron;Ohio 3 Albany;New_York 3 Albuquerque;New_Mexico 3 Allentown;Pennsylvania 2 Altoona;Pennsylvania 1 Amarillo;Texas 2 dtype: int64
# Add City and State columns to the DataFrame.
# Split each "City;State" location string into separate City and State
# columns for both endpoints of every conduit. The original per-row loop
# assigned through `.values[ndx]` — treating index *labels* as positions,
# which is fragile — and is replaced by vectorized .str accessors with the
# same semantics.
split_incoming = source_sink.Incoming.str.split(';')
split_outgoing = source_sink.Outgoing.str.split(';')
# Column creation order preserved: cities first, then states.
source_sink['Incoming_City'] = split_incoming.str[0]
source_sink['Outgoing_City'] = split_outgoing.str[0]
# If the State part is missing, fall back to the City (e.g. Washington DC):
# .str[-1] yields the state when present, else the lone city token.
source_sink['Incoming_State'] = split_incoming.str[-1]
source_sink['Outgoing_State'] = split_outgoing.str[-1]
# Form the new frame, keeping only the munged City/State columns.
new_source_sink = source_sink.drop(['Source_Index', 'Sink_Index', 'Incoming', 'Outgoing'], axis=1)
# We check those Cities and State we just munged
# source_sink[source_sink.Incoming_City == source_sink.Incoming_State]
# source_sink[source_sink.Outgoing_City == source_sink.Outgoing_State]
new_source_sink.head()
| Incoming_City | Outgoing_City | Incoming_State | Outgoing_State | |
|---|---|---|---|---|
| 0 | Bellevue | Harbour_Pointe | Washington | Washington |
| 1 | Seattle | Harbour_Pointe | Washington | Washington |
| 2 | Spokane | Harbour_Pointe | Washington | Washington |
| 3 | Bellevue | Seattle | Washington | Washington |
| 4 | Hillsboro | Seattle | Oregon | Washington |
# Top 5 and bottom 5 cities by relative gap between incoming and outgoing
# link counts, expressed as a percentage of the incoming count.
city_in_out_diff = pd.merge(
    pd.DataFrame(new_source_sink['Incoming_City'].value_counts()),
    pd.DataFrame(new_source_sink['Outgoing_City'].value_counts()),
    left_index=True, right_index=True)
raw_gap = city_in_out_diff.Incoming_City - city_in_out_diff.Outgoing_City
city_in_out_diff['City_Difference'] = raw_gap / city_in_out_diff.Incoming_City * 100
city_in_out_diff = city_in_out_diff.sort_values('City_Difference', ascending=False)
# Concatenate the two extremes into a single frame for plotting.
top5_low5 = pd.concat([city_in_out_diff.head(), city_in_out_diff.tail()])
ax = plt.figure(figsize=(13,7))
sns.barplot(x='City_Difference', y=top5_low5.index.values, data=top5_low5, label='City_Difference')
plt.xlabel('Average Link Utilization Difference (%)')
plt.title('Incoming and Outgoing Links DIFFERENCES (in %) for few locations')
<matplotlib.text.Text at 0x119491198>
# Top 5 and bottom 5 STATES by relative incoming/outgoing link gap (%).
state_in_out_diff = pd.merge(
    pd.DataFrame(new_source_sink['Incoming_State'].value_counts()),
    pd.DataFrame(new_source_sink['Outgoing_State'].value_counts()),
    left_index=True, right_index=True)
state_gap = state_in_out_diff.Incoming_State - state_in_out_diff.Outgoing_State
state_in_out_diff['State_Difference'] = state_gap / state_in_out_diff.Incoming_State * 100
state_in_out_diff = state_in_out_diff.sort_values('State_Difference', ascending=False)
print(state_in_out_diff.head(3))
print("\n")
print(state_in_out_diff.tail(3))
Incoming_State Outgoing_State State_Difference
Nevada 8 2 75.0
Nebraska 5 2 60.0
Massachusetts 8 5 37.5
Incoming_State Outgoing_State State_Difference
Missouri 2 6 -200
Iowa 2 6 -200
Alabama 1 6 -500
# Horizontal bars of the per-state percentage gap (first 40 states).
ax = plt.figure(figsize=(20,16))
sns.barplot(x='State_Difference', y=state_in_out_diff.index[:40],
            data=state_in_out_diff.head(40),
            label='State_Difference', palette='GnBu_d')
plt.xlabel('Average Link Utilization Difference by STATE(%)')
plt.title('Incoming and Outgoing Link DIFFERENCES (in %) for different States')
<matplotlib.text.Text at 0x10f55ca58>
# Switch City_Difference from a percentage to the raw in-minus-out count,
# then draw the pairwise regression/KDE matrix for the city-level metrics.
city_in_out_diff['City_Difference'] = (city_in_out_diff.Incoming_City
                                       - city_in_out_diff.Outgoing_City)
sns.pairplot(city_in_out_diff, diag_kind="kde", markers="+", kind="reg",
             diag_kws=dict(shade=True))
<seaborn.axisgrid.PairGrid at 0x10cb97be0>
# Three side-by-side regressions over the city-level link counts.
f, axes = plt.subplots(1,3, figsize=(20, 5))
pairs = [("Incoming_City", "Outgoing_City", 'darkorange'),
         ("Incoming_City", "City_Difference", 'brown'),
         ("Outgoing_City", "City_Difference", 'purple')]
for axis, (xcol, ycol, colr) in zip(axes, pairs):
    sns.regplot(xcol, ycol, data=city_in_out_diff, color=colr, ax=axis)
<matplotlib.axes._subplots.AxesSubplot at 0x11bedee10>
# Raw (non-percentage) state-level gap, re-sorted, then the pairwise
# regression/KDE grid for the state metrics.
state_in_out_diff['State_Difference'] = (state_in_out_diff.Incoming_State
                                         - state_in_out_diff.Outgoing_State)
state_in_out_diff = state_in_out_diff.sort_values('State_Difference', ascending=False)
sns.pairplot(state_in_out_diff, diag_kind="kde", markers="+", kind="reg",
             diag_kws=dict(shade=True))
<seaborn.axisgrid.PairGrid at 0x10ed69d30>
Pearson's r is a measure of the linear correlation, or dependency, between two variables. When a linear equation describes the relationship between X and Y perfectly, Pearson's r equals 1.
# Joint regression plots (scatter + fit line + marginal distributions) for
# each pair of state-level metrics; seaborn annotates Pearson's r.
p1 = sns.jointplot("Incoming_State", "Outgoing_State", data=state_in_out_diff, kind='reg', color='darkblue')
p2 = sns.jointplot("Incoming_State","State_Difference", data=state_in_out_diff, kind='reg', color='darkred')
p3 = sns.jointplot("Outgoing_State","State_Difference", data=state_in_out_diff, kind='reg', color='darkgreen')
# NOTE(review): machine-specific absolute paths — these savefig calls only
# work on the author's machine; consider a relative output directory.
p1.savefig("/Users/RichardAfolabi/myGitHub/turaquo/static/img/state_out_vs_in")
p2.savefig("/Users/RichardAfolabi/myGitHub/turaquo/static/img/state_in_vs_diff")
p3.savefig("/Users/RichardAfolabi/myGitHub/turaquo/static/img/state_out_vs_diff")
We need to encode the US states with their postal codes before we can overlay the data on an interactive map; postal codes are not provided in the dataset.
http://www.infoplease.com/ipa/A0110468.html contains list of US States and relevant PostalCode. We go ahead and scrape the page. Clean the scraped data and merge with our existing dataframe.
# Scrape web data for US States abbreviations and Postal Codes.
# Source: http://www.infoplease.com/ipa/A0110468.html
def postalcode_scraper():
    """Scrape a known URL and return a pandas DataFrame containing the
    list of US States as index and their Postal Codes as column values.
    """
    # URL to scrape
    data_url = "http://www.infoplease.com/ipa/A0110468.html"
    # Scrape the page with pandas; table [1] is assumed to be the
    # State/abbreviation/code table — this depends on the page layout and
    # will break if the page changes (TODO confirm).
    table_scrape = pd.read_html(data_url)[1]
    # Substitute spaces in State names with '_' to conform to the existing dataset
    table_scrape[0] = [tab.replace(' ','_') for tab in table_scrape[0] ]
    # Reindex using corrected state names to conform to existing dataframe formats
    table_scrape.index = table_scrape[0]
    # Remove extraneous data elements and unneeded columns
    table_scrape.drop([0,1], axis=1, inplace=True)
    # Drop the header row that was parsed as a data row labeled 'State'.
    table_scrape.drop('State', inplace=True)
    # Set column and index names.
    table_scrape.index.name = 'State'
    table_scrape.columns = ['PostalCode']
    return table_scrape
postalcode_table = postalcode_scraper()
postalcode_table.head()
| PostalCode | |
|---|---|
| State | |
| Alabama | AL |
| Alaska | AK |
| American_Samoa | AS |
| Arizona | AZ |
| Arkansas | AR |
# Per-state link summary, re-keyed by two-letter postal code for mapping.
state_map = state_in_out_diff[['Incoming_State', 'Outgoing_State']].copy()
state_map['State_Difference'] = state_map.Incoming_State - state_map.Outgoing_State
# Left-join so states whose name didn't match the scraped table keep NaN
# (inspectable below) instead of dropping out.
state_map = pd.merge(state_map, postalcode_table, left_index=True, right_index=True, how='left')
# check those state with missing Postal code and encode them manually.
# print(state_map[state_map.PostalCode.isnull()])
# Since Washington DC has no state, we can directly encode it as 'DC'.
# FIX: `.ix` was deprecated in pandas 0.20 and removed in 1.0; the
# label-based equivalent is `.loc`.
state_map.loc['Washington_DC'] = state_map.loc['Washington_DC'].fillna('DC')
# Then check again...
print(state_map.loc['Washington_DC'])
# Move the state name into a column and index by postal code.
state_map['US_States'] = state_map.index
state_map.index = state_map.PostalCode
state_map.drop('PostalCode', axis=1, inplace=True)
state_map.head(2)
Incoming_State 9 Outgoing_State 6 State_Difference 3 PostalCode DC Name: Washington_DC, dtype: object
| Incoming_State | Outgoing_State | State_Difference | US_States | |
|---|---|---|---|---|
| PostalCode | ||||
| NV | 8 | 2 | 6 | Nevada |
| NE | 5 | 2 | 3 | Nebraska |
def find_states_missing(df):
    """Return a DataFrame of states present in the scraped postal-code
    table but absent from `df`'s postal-code index.

    Result is indexed by postal code with a single `US_States` column
    holding the underscore-joined state name.
    """
    postcodes = postalcode_scraper()
    # Vectorized membership test replaces the original per-code loop that
    # built one-row frames and concatenated them; it also avoids the
    # pd.concat([]) crash when no state is missing.
    states_missing = postcodes[~postcodes.PostalCode.isin(df.index)].copy()
    states_missing['US_States'] = states_missing.index
    states_missing.index = states_missing.PostalCode.values
    states_missing.drop('PostalCode', axis=1, inplace=True)
    return states_missing

# Call the function and create the relevant dataset
missing_states = find_states_missing(state_map)
missing_states.head(3)
| US_States | |
|---|---|
| AK | Alaska |
| AS | American_Samoa |
| DE | Delaware |
from collections import OrderedDict
from bokeh.sampledata import us_states, us_cities
from bokeh.plotting import figure, show, output_file, ColumnDataSource
from bokeh.models import HoverTool
# NOTE(review): hplot/vplot were removed from bokeh.io in later Bokeh
# releases — this import pins the notebook to an old Bokeh version.
from bokeh.io import output_notebook, hplot, vplot
# Route Bokeh output into the notebook instead of a standalone file.
output_notebook()
def assign_color(code):
    """Return a fill color for the state with postal code `code`.

    Light red when outgoing links exceed incoming; otherwise one of the
    blue shades in the module-level `colors` ramp, darker for a larger
    incoming surplus. Reads the module-level `state_map` (indexed by
    postal code) and `colors` list defined in the next cell.
    """
    # FIX: `.ix` was removed from pandas; use label-based `.loc`.
    link_diff = state_map.State_Difference.loc[code]
    if link_diff < 0:           # Outgoing more than incoming links
        return "#ED8466"        # Negative => Light Red
    # FIX: the original `min(link_diff, len(colors))` could index one past
    # the end of `colors` (IndexError whenever the surplus >= len(colors));
    # clamp to the last valid index instead.
    return colors[min(link_diff, len(colors) - 1)]
# Build per-state plotting arrays (name, fill color, in/out link counts)
# aligned with the state-outline coordinate lists from Bokeh sample data.
state_names = []
state_colors = []
state_incoming_link = []
state_outgoing_link = []
usa_states = us_states.data.copy()
# Drop Alaska and Hawaii: the map covers the contiguous US only.
del usa_states['AK']
del usa_states['HI']
state_map = state_map.sort_values('State_Difference', ascending=False)
missing_states = find_states_missing(state_map)
us_state_xs = [usa_states[code]["lons"] for code in usa_states]
us_state_ys = [usa_states[code]["lats"] for code in usa_states]
# Blue ramp used by assign_color(): light (small surplus) -> dark (large).
colors = ["#D1E2F2", "#ADCCE5", "#77B0D4", "#448BC0", "#449CC0",
          "#2B62B2","#2264AB", "#0D408E", "#294F70", "#273A48"]
# Iterate over US Long/Lat list.
# FIX: every `.ix` indexer below replaced with `.loc` (`.ix` was
# deprecated in pandas 0.20 and removed in 1.0).
for code in usa_states:
    # If the state appears in our infrastructure data
    if code in state_map.index.values:
        state_colors.append(assign_color(code))
        state_names.append(state_map.US_States.loc[code])
        state_incoming_link.append(state_map.Incoming_State.loc[code])
        state_outgoing_link.append(state_map.Outgoing_State.loc[code])
    else:
        # No fiber link recorded in this state.
        state_colors.append('#979383')  # Shade of gray
        state_names.append(missing_states.US_States.loc[code])
        state_incoming_link.append(0)
        state_outgoing_link.append(0)
# Package the per-state arrays into a single data source for the patches
# glyph; the `name`/`incoming`/`outgoing` fields feed the hover tooltip.
source = ColumnDataSource(
    data = dict(
        x=us_state_xs,
        y=us_state_ys,
        color=state_colors,
        name=state_names,
        incoming=state_incoming_link,
        outgoing=state_outgoing_link
    ))
TOOLS="pan,wheel_zoom,box_zoom,reset,hover,save"
plot = figure(tools=TOOLS, plot_width=900, plot_height=600, toolbar_location='left')
# NOTE(review): assigning a plain string to `plot.title` relies on old
# Bokeh behavior; newer Bokeh expects a Title model
# (`plot.title.text = ...`) — confirm against the pinned Bokeh version.
plot.title=('Fiber-Optics Coverage within United States')
# One filled patch per state, colored by assign_color() above.
plot.patches('x', 'y', fill_color='color', line_color="#333333", line_width=0.5, source=source)
# Configure the tooltips
hover = plot.select(dict(type=HoverTool))
hover.point_policy = "follow_mouse"
hover.tooltips = OrderedDict([
    ("Name ", "@name"),
    ("Incoming Fiber Links ", " @incoming"),
    ("Outgoing Fiber Links ", " @outgoing")
])
show(plot)
# Image("../turaquo/static/img/fiber-optix-links.png")
# Bokeh, Flask & Jinja2
# Create a skeletal Jinja2 HTML template with placeholders for the Bokeh
# <script>/<div> pair produced by bokeh.embed.components() below.
import jinja2
template = jinja2.Template("""
<!DOCTYPE html>
<html lang="en-US">
<link
href="http://cdn.pydata.org/bokeh/release/bokeh-0.9.0.min.css"
rel="stylesheet" type="text/css"
>
<script
src="http://cdn.pydata.org/bokeh/release/bokeh-0.9.0.min.js"
></script>
<body>
<h1>Hello Bokeh!</h1>
<p> Below is a simple plot of stock closing prices </p>
{{ script }}
{{ div }}
</body>
</html>
""")
from IPython.display import HTML
from bokeh.plotting import figure
from bokeh.embed import components
from bokeh.palettes import brewer
# NOTE(review): bokeh._legacy_charts is a private module that was removed
# in later Bokeh releases (it already warns below) — this cell only runs
# on the old Bokeh version the notebook was written against.
from bokeh._legacy_charts import Bar, Histogram
from bokeh.io import output_notebook, hplot, vplot
output_notebook()
# NOTE(review): this rebinds `df` (also referenced by an earlier cell) to
# the autompg sample data — ordering-sensitive; verify before re-running.
from bokeh.sampledata.autompg import autompg as df
TOOLS = "resize,pan,wheel_zoom,box_zoom,reset,previewsave"
# One row per metric (transposed) so each city becomes its own bar.
city_diff = pd.DataFrame(top5_low5.City_Difference).transpose()
# RdYlBu
mycolrs = brewer["Spectral"][10]
bar = Bar(city_diff, city_diff.index.tolist(), stacked=False, responsive=True,
          legend="bottom_left", height=450, width=900, tools=TOOLS, palette=mycolrs)
/Users/RichardAfolabi/anaconda/lib/python3.5/site-packages/bokeh/_legacy_charts/_chart.py:92: UserWarning: Instantiating a Legacy Chart from bokeh._legacy_charts
warn("Instantiating a Legacy Chart from bokeh._legacy_charts")
# Render the Bokeh chart into <script>/<div> fragments and inject them into
# the Jinja2 template for inline display in the notebook.
script, div = components(bar)
HTML(template.render(script=script, div=div))
Below is a simple plot of stock closing prices
Bokeh web object graphics components and embed into HTML¶